import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
df_CSF_inf = pd.read_csv('data/Inflammation_CSF_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)
df_CSF_inf.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_CSF_inf_metadatas = df_CSF_inf.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_CSF_inf.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)
inflammationMolecules = list( df_CSF_inf.columns)
for m in inflammationMolecules:
df_CSF_inf[m] = df_CSF_inf[m].astype(float)
df_CSF_inf.head()
df_CSF_neuro = pd.read_csv('data/Neurology_CSF_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)
df_CSF_neuro.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_CSF_neuro_metadatas = df_CSF_neuro.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_CSF_neuro.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)
neuroMolecules = list( df_CSF_neuro.columns )
for m in neuroMolecules:
df_CSF_neuro[m] = df_CSF_neuro[m].astype(float)
df_CSF_neuro.head()
df_Plasma_neuro = pd.read_csv('data/Neurology_Plasma_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)
df_Plasma_neuro.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_Plasma_neuro_metadatas = df_Plasma_neuro.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_Plasma_neuro.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)
for m in neuroMolecules:
df_Plasma_neuro[m] = df_Plasma_neuro[m].astype(float)
df_Plasma_neuro.head()
df_Plasma_inf = pd.read_csv('data/Inflammation_Plasma_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)
df_Plasma_inf.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_Plasma_inf_metadatas = df_Plasma_inf.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_Plasma_inf.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)
for m in inflammationMolecules:
df_Plasma_inf[m] = df_Plasma_inf[m].astype(float)
df_Plasma_inf.head()
Are the index the same ?
plasmaSamples = set(df_Plasma_inf.index)
csfSamples = set(df_CSF_inf.index)
print( 'csfSamples-plasmaSamples',csfSamples-plasmaSamples )
print( 'plasmaSamples-csfSamples' , plasmaSamples-csfSamples )
4 missing CSF sample (expected from the metadata)
Are the CSF and plasma measurements correlated ?
# for c in inflammationMolecules:
# x = df_Plasma_inf.loc[df_CSF_inf.index , c]
# y = df_CSF_inf.loc[df_CSF_inf.index , c]
# r,p = stats.pearsonr(list(x),list(y))
# fig,ax = plt.subplots(1,1)
# ax.plot(x,y,'o')
# ax.set_xlabel("plasma")
# ax.set_ylabel("CSF")
# ax.set_title( '{} NPX - pearson correlation : {:.2f} '.format(c,r) )
# fig.savefig('images/preliminary/'+c+'.plasma_CSF.png')
# for c in neuroMolecules:
# x = df_Plasma_neuro.loc[df_CSF_inf.index , c]
# y = df_CSF_neuro.loc[df_CSF_inf.index , c]
# r,p = stats.pearsonr(list(x),list(y))
# fig,ax = plt.subplots(1,1)
# ax.plot(x,y,'o')
# ax.set_xlabel("plasma")
# ax.set_ylabel("CSF")
# ax.set_title( '{} NPX - pearson correlation : {:.2f} '.format(c,r) )
# fig.savefig('images/preliminary/'+c+'.plasma_CSF.png')
df_metadata = pd.read_csv('data/metadata.csv', index_col='studyID')
df_metadata.covid = df_metadata.covid.astype(bool)
df_metadata.Age = df_metadata.Age.astype(int)
df_metadata.head()
DF = df_CSF_inf
DFMD = df_CSF_inf_metadatas
MOL = inflammationMolecules
if DFMD.shape[0]==3 :
DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)
underThreshold = DF<DFMD.LOD
DF[underThreshold] = np.nan
N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))
axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')
axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')
axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
DF = df_Plasma_inf
DFMD = df_Plasma_inf_metadatas
MOL = inflammationMolecules
if DFMD.shape[0]==3 :
DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)
underThreshold = DF<DFMD.LOD
DF[underThreshold] = np.nan
N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))
axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')
axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')
axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
DF = df_CSF_neuro
DFMD = df_CSF_neuro_metadatas
MOL = neuroMolecules
if DFMD.shape[0]==3 :
DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)
underThreshold = DF<DFMD.LOD
DF[underThreshold] = np.nan
N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))
axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')
axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')
axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
DF = df_Plasma_neuro
DFMD = df_Plasma_neuro_metadatas
MOL = neuroMolecules
if DFMD.shape[0]==3 :
DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)
underThreshold = DF<DFMD.LOD
DF[underThreshold] = np.nan
N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))
axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')
axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')
axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
The inflammatory panel contains much more measurments under the detection thresholds than the neuro panel.
Also, as experienced in previous experiments the fraction of NAs are associated with a lower median expression.
Proposed strategy :
We will first drop the columns which have no non-NAs:
def imputeOLINKdata( DF, DFMD ,ax):
if DFMD.shape[0]==3 :
DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)
toDrop = list( DF.columns[ (DF.isnull().sum() / DF.shape[1]) == 1 ] )
if toDrop == 1 :
print('dropping columns', toDrop)
DF.drop(columns = toDrop , inplace=True)
else:
print('no columns to drop')
I1 = ( DF.sum() * 0.1/0.9 ) / DF.isnull().sum()
I2 = 0.5 * DFMD.loc[ DF.columns , "LOD"]
imputationValues = np.minimum( I1 , I2 )
ax.scatter( DF.isnull().sum() / DF.shape[0], I1 , label="NA sum == 0.1 colSum")
ax.scatter( DF.isnull().sum() / DF.shape[0] , I2 , label = "0.5 * detection threshold" )
ax.set_ylim(0.0,10.0)
ax.set_xlabel( "NAs fraction" )
ax.set_ylabel( "imputation NPX" )
ax.legend()
DFImputed = DF.copy()
DFImputed.fillna(imputationValues , inplace = True)
#print( 'number of NAs after imputation: ' , DFImputed.isnull().sum().sum() )
return DFImputed
fig,axes = plt.subplots(2,2,figsize = (14,10))
df_Plasma_neuro_imputed = imputeOLINKdata( df_Plasma_neuro, df_Plasma_neuro_metadatas ,axes[0][0])
axes[0][0].set_title("Plasma - neuro")
df_Plasma_inf_imputed = imputeOLINKdata( df_Plasma_inf, df_Plasma_inf_metadatas ,axes[0][1])
axes[0][1].set_title("Plasma - inflammatory")
df_CSF_neuro_imputed = imputeOLINKdata( df_CSF_neuro, df_CSF_neuro_metadatas ,axes[1][0])
axes[1][0].set_title("CSF - neuro")
df_CSF_inf_imputed = imputeOLINKdata( df_CSF_inf, df_CSF_inf_metadatas ,axes[1][1])
axes[1][1].set_title("CSF - inflammatory")
for c in inflammationMolecules:
x = df_Plasma_inf_imputed.loc[df_CSF_inf_imputed.index , c]
y = df_CSF_inf_imputed.loc[df_CSF_inf_imputed.index , c]
r,p = stats.pearsonr(list(x),list(y))
t,p = stats.kendalltau(list(x),list(y))
fig,ax = plt.subplots(1,1)
sns.scatterplot( x=x,y=y, hue= df_metadata.Group[ df_CSF_inf_imputed.index ] , ax = ax)
#ax.plot(x,y,'o')
ax.set_xlabel("plasma")
ax.set_ylabel("CSF")
ax.set_title( '{} NPX - pearson : {:.2f} - kendall : {:.2f} '.format(c,r,t) )
fig.savefig('images/imputed/inflammatory_'+c+'.plasma_CSF.png')
for c in neuroMolecules:
x = df_Plasma_neuro_imputed.loc[df_CSF_neuro_imputed.index , c]
y = df_CSF_neuro_imputed.loc[df_CSF_neuro_imputed.index , c]
r,p = stats.pearsonr(list(x),list(y))
t,p = stats.kendalltau(list(x),list(y))
fig,ax = plt.subplots(1,1)
sns.scatterplot( x=x,y=y, hue= df_metadata.Group[ df_CSF_neuro_imputed.index ] , ax = ax)
#ax.plot(x,y,'o')
ax.set_xlabel("plasma")
ax.set_ylabel("CSF")
ax.set_title( '{} NPX - pearson : {:.2f} - kendall : {:.2f} '.format(c,r,t) )
fig.savefig('images/imputed/neuro_'+c+'.plasma_CSF.png')
It would appear that for most markers the correlation is present but not very strong.
Some markers seems to have different discriminating abilities depending on whether the sample comes from plasma or CSF.
Some columns have a very high number of NAs... I should be especially careful with these and any conclusions that may hang on them.
from sklearn import decomposition
from sklearn import preprocessing
data = df_CSF_neuro_imputed
# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)
pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)
data_pca_transformed = pca.transform(data_scaled)
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o')
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o')
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px
fig = px.scatter(data_pca , x= 0 , y=1 ,
color = 'Group' , symbol = 'Sex',
hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro.html")
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
color = 'Group' , symbol = 'Sex',
hover_data = ['Age','Sex','Stage','Group'] ,
labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) ,
'1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) ,
'2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
from sklearn import decomposition
from sklearn import preprocessing
data = df_Plasma_neuro_imputed
# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)
pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)
data_pca_transformed = pca.transform(data_scaled)
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o')
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o')
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px
fig = px.scatter(data_pca , x= 0 , y=1 ,
color = 'covid' ,
hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_Plasma_neuro.html")
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
color = 'Group' , symbol = 'Sex',
hover_data = ['Age','Sex','Stage','Group'] ,
labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) ,
'1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) ,
'2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_Plasma_neuro_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
from sklearn import decomposition
from sklearn import preprocessing
data = df_CSF_inf_imputed
# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)
pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)
data_pca_transformed = pca.transform(data_scaled)
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o')
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o')
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px
fig = px.scatter(data_pca , x= 0 , y=1 ,
color = 'Group' ,
hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_inf.html")
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
color = 'Group' , symbol = 'Sex',
hover_data = ['Age','Sex','Stage','Group'] ,
labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) ,
'1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) ,
'2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_inflammatory_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
from sklearn import decomposition
from sklearn import preprocessing
data = df_CSF_neuro_imputed
# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)
pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)
data_pca_transformed = pca.transform(data_scaled)
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o')
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o')
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px
fig = px.scatter(data_pca , x= 0 , y=1 ,
color = 'Group' ,
hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro.html")
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
color = 'Group' , symbol = 'Sex',
hover_data = ['Age','Sex','Stage','Group'] ,
labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) ,
'1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) ,
'2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
commonMarkers = list ( set( df_CSF_neuro_imputed.columns ).intersection( df_CSF_inf_imputed.columns ) )
commonMarkers
fig, axes = plt.subplots(2,len(commonMarkers) , figsize = (14,14) )
for i,c in enumerate(commonMarkers):
axes[0][i].plot( df_CSF_neuro_imputed.loc[df_CSF_neuro_imputed.index , c] ,
df_CSF_inf_imputed.loc[df_CSF_neuro_imputed.index , c] , 'o')
axes[0][i].set_xlabel('CSF neuro '+c)
axes[0][i].set_ylabel('CSF inflammatory '+c)
axes[1][i].plot( df_Plasma_neuro_imputed.loc[df_Plasma_neuro_imputed.index , c] ,
df_Plasma_inf_imputed.loc[df_Plasma_neuro_imputed.index , c] , 'o' )
axes[1][i].set_xlabel('Plasma neuro '+c)
axes[1][i].set_ylabel('Plasma inflammatory '+c)
These are heavily imputed columns in both cases, so this is not very informative ...
## let's keep only the neuro copy of the 2 common markers
inflammationMoleculesOfInterest = inflammationMolecules.copy()
for c in commonMarkers:
inflammationMoleculesOfInterest.remove(c)
tmpCSF = pd.concat( [ df_CSF_neuro_imputed, df_CSF_inf_imputed.loc[:,inflammationMoleculesOfInterest] ] , axis = 1 )
tmpCSF.columns = 'CSF_' + tmpCSF.columns
print(tmpCSF.shape)
tmpCSF.head()
tmpPlasma = pd.concat( [ df_Plasma_neuro_imputed, df_Plasma_inf_imputed.loc[:,inflammationMoleculesOfInterest] ] , axis = 1 )
tmpPlasma.columns = 'Plasma_' + tmpPlasma.columns
print(tmpPlasma.shape)
tmpPlasma.head()
total = pd.concat( [tmpCSF , tmpPlasma] ,axis=1)
total.shape
df_CSF_inf_imputed.to_csv( 'imputed_data/Inflammation_CSF_imputed.csv' )
df_CSF_neuro_imputed.to_csv('imputed_data/Neurology_CSF_imputed.csv')
df_Plasma_inf_imputed.to_csv( 'imputed_data/Inflammation_Plasma_imputed.csv' )
df_Plasma_neuro_imputed.to_csv('imputed_data/Neurology_Plasma_imputed.csv')
total.to_csv( 'imputed_data/ALL_imputed.csv' )